* ==============================================================================
// STEP 1: Load data on driving per day per driving period
* ==============================================================================

/* 
The dataset "driving_periods" has one observation per 
(driving period, driving sub-period)-combination.

A driving period is the time between two EU controls (i.e. odometer readings),
or from the first registration date to the first EU control. The time stamps stored in
"eu_start" and "eu_end" marks the beginning and the end of each driving period.

Each driving period is split into sub-period whenever (a) the car changes owner,
or (b) the car-owning household makes other changes in their car portfolio.
Hence, household-level car ownership will always be constant within each sub-period.
"hh_start" and "hh_end" marks the beginning and the end of each driving sub-period.

- "familie_id" is the sub-period specific household identifier
- "carid" id the period specific car identifier
- "carid_secondary" is the sub-period specific identifier of the newest car owned 
   by the household in addition to the car identified by "carid"
   
 */
 
use koere_periode_id 			/// Driving period ID (A driving period is the time betw. two EU controls)
	sub_period 					/// Driving sub-period ID (see lines 13-16)
	km_dag 						/// Driving per day ((Odometer reading at EU control minus odometer reading at previous EU control) divided by days between EU controls)
	eu_start 					/// Date driving period starts
	eu_end 						/// Date driving period ends
	hh_start 					/// Date driving sub-period starts
	hh_end 						/// Date driving sub-period ends
	days_eu 					/// Lenght of driving period (days between EU controls)
	days_hh 					/// Length of driving sub-period
	familie_id 					/// sub-period specific household ID variable (of the household that owns the car)
	cars 						/// sub-period specific number of cars owned by the household
	carid 						/// driving period specific car ID
	age_car 					/// car age
	diesel 						/// diesel (dummy)
	electric 					/// electric (dummy)
	price_per_km 				/// Fuel efficiency
	carid_secondary 			/// sub-period specific ID variable for "familie_id"'s other car
	age_car_secondary 			/// Age of the other car
	diesel_secondary 			/// Diesel (dummy) for other car
	electric_secondary 			/// Electric (dummy) for other car
	price_per_km_secondary 		/// Fuel efficiency 
	ctrl_nr 					/// How many odometer readings the car has been to
	freg_DMY 					/// First registration date of the car
	freg_DMY_secondary 			/// First registration date of the other car
	using "${datain}driving_periods" , clear
compress 	
	
/* =============================================================================
// STEP 2: Dropping invalid observations, and only include cars up to 25 years old
============================================================================= */
drop if (age_car == . | price_per_km == .) 	/// /* missing car attr */
	| (age_car_secondary == . & cars > 1) 	/// /* missing secondary car attr */
	| (days_eu < 365*2-150) 				/// /* eu control period too short */
	| (days_eu > 365*4 + 150) 				/// /* eu control period too long */
	| (age_car > 25) 							/* car more than 25 years old */


/* households with the youngest secondary car above 25 years should not count as two-car households */
foreach var in age_car_secondary diesel_secondary ///
	electric_secondary price_per_km_secondary freg_DMY_secondary {
		replace `var' = . if (age_car_secondary > 25 & age_car_secondary != .)
}
replace carid_secondary = "" if (age_car_secondary > 25 & age_car_secondary != .)
replace cars = 1 if (age_car_secondary > 25 & age_car_secondary != .)

* The share of the driving period each sub-period covers:
gen weight_full_period = (hh_end - hh_start)/(eu_end - eu_start)
* The share of the driving period that is covered by sub-periods that are "valid" (still kept in the data)
bys koere_periode_id: egen weight_valid_sum = sum(weight_full_period)
* If less than 70 percent of a driving period consists of valid sub-periods, drop it
drop if weight_valid_sum < 0.7 
* Drop the auxiliary variable
drop weight_valid_sum 
* Drop sub-periods starting before 2005 (i.e. a time period not covered by the data)
drop if year(hh_start) <= 2005

/* =============================================================================
// STEP 3: Expand the dataset from (period,sub-period)-level 
   to (period, sub-period, year)-level
============================================================================= */
* The reason we want to do this, is to merge on annual data on the household level

* Start by generating a new ID variable to denote (period,sub-period) combinations
gen double eventid = _n
* How many years are covered by each driving sub-period?
gen years_temp = year(hh_end) - year(hh_start) + 1
* Expand each sub-period "year" number of times
expand years_temp
* Drop auxiliary variable
drop years_temp
* Generating "year" variable
bysort eventid: gen year = year(hh_start[1]) + _n - 1
/* == Generating "days": number of days each year the driving sub-period covers */
bysort eventid (year): /// Number of days the first year of an event
	gen days = min(hh_end, mdy(12,31,year)) - hh_start + 1 if _n == 1
bysort eventid (year): /// Number of days the last year of an event
	replace days = hh_end - mdy(1,1,year) if _n == _N & missing(days)
* Number of days in the middle (number of days of the year)
replace days = mdy(12,31,year) - mdy(1,1,year) + 1 if missing(days)
compress

/* =============================================================================
// STEP 4: Prepare household level data for merging
============================================================================= */
preserve	
	use year familienr couple children antpers_i_regstat_famnr secondhome ///
		employed1 employed2 retired1 retired2 age1 age2 ///
		wies1_decile wies2_decile wealth1_decile  wealth2_decile ///
		wies1 wies2 wealth1 wealth2 /// 
		grputd1 grputd2 dist1 dist2 time1 time2 ///
		ptl1 ptl2 toll1 toll2 ///
		toll_fam_mean ptl_fam_km_mean ///
		grkrets_num grk_bed1_num grk_bed2_num ///
		PublicTransitTime_1 PublicTransitTime_2 ///
		PublicVSCarTime_fam_mean PublicDiffCarTime_fam_mean ///
		VENTETID_1 BOARDINGS_1 TAKST_1 VENTETID_2 BOARDINGS_2 TAKST_2 ///
		using "${dataout}MainDataset", clear
	* Ensure family-id variable has the same format as in the driving dataset
	gen familie_id = substr(familienr,2,.)
	drop familienr
	destring familie_id, replace				
	tempfile hhdata
	save `hhdata', replace
restore

/* =============================================================================
// STEP 5: Merge annual household level data to 
   (driving period,sub-period,year) observations
============================================================================= */
merge m:1 familie_id year using `hhdata', keep(match)
drop _merge
compress

/* =============================================================================
// STEP 6: Collapsing the data 
   - From (driving period,sub-period,year) observations
   - To (driving period, household) observations
============================================================================= */

/* Collapsing together all events with the same household as owner 
   Using the days of the year covered as weight.
*/ 
		
* Unit of observation in final dataset: combination of unique driving periods and car owners
egen collapseid = group(koere_periode_id familie_id)

/*
Since "collapse" treats missing as zeros, we replace "missing" by the average
value (weighted by number of days, as in the "collapse" command)
*/
foreach var in time1 time2 dist1 dist2 ptl1 ptl2 toll1 toll2 toll_fam_mean ptl_fam_km_mean PublicTransitTime_1 PublicTransitTime_2 PublicVSCarTime_fam_mean PublicDiffCarTime_fam_mean VENTETID_1 BOARDINGS_1  TAKST_1  VENTETID_2 BOARDINGS_2 TAKST_2  {
		/* weighted mean */
		egen num = total(`var' * days * !missing(`var',days)), by(collapseid)
		egen den = total(        days * !missing(`var',days)), by(collapseid)
		replace `var' = num/den if `var' == .
		drop num den
}

bys collapseid: egen aux = mean(age_car_secondary)
replace age_car_secondary = aux if age_car_secondary == .
drop aux

* Dropping all observations where data on the commute is missing
drop if (dist1 == . & age1 != .) 				/// /* HH member 1 exists, but has missing distance */
	| (dist2 == . & age2 != .) 					/// /* HH member 2 exists, but has missing distance */
	| (PublicTransitTime_1 == . & age1 != .) 	/// /* HH member 1 exists, but has missing PT info */
	| (PublicTransitTime_2 == . & age2 != .)   		/* HH member 2 exists, but has missing PT info */

/* Sorting by days so that the basic statistical unit ("grk") with highest
   number of days is chosen */
gsort collapseid -days    
replace days_hh = days 
replace electric_secondary = 0 if electric_secondary == .  
gen secondary = (carid_secondary != "")

/* === Collapsing the data ================================================== */   
collapse ///
	(first) koere_periode_id km_dag eu_start eu_end /// 
		days_eu familie_id carid diesel /// 
		electric ///
		carid_secondary /// 
		ctrl_nr freg_DMY freg_DMY_secondary /// 
		wies1_decile wies2_decile wealth1_decile wealth2_decile /// New variables
		grputd1 grputd2 grkrets_num grk_bed1_num grk_bed2_num ///
	(min) hh_start ///
	(max) hh_end ///
	(rawsum) days_hh ///	
	(mean) couple cars age_car price_per_km age_car_secondary secondary electric_secondary /// Car variables
		price_per_km_secondary ///
		children antpers_i_regstat_famnr secondhome /// hh variables
		employed1 employed2 retired1 retired2 age1 age2 ///
		dist1 dist2 time1 time2 ptl1 ptl2 toll1 toll2 ///
		toll_fam_mean ptl_fam_km_mean ///
		PublicTransitTime_1 PublicTransitTime_2 PublicVSCarTime_fam_mean ///
		PublicDiffCarTime_fam_mean ///
		VENTETID_1 BOARDINGS_1  TAKST_1  VENTETID_2 BOARDINGS_2 TAKST_2 ///
		wies1 wies2 wealth1 wealth2 ///
	[fw = days] /// weight for hh variables
	, by(collapseid)
	
compress

* driving period-specific identifier for the new definition of sub-periods
bys koere_periode_id (hh_start): gen subperiod = _n	
order subperiod, a(koere_periode_id)

/* =============================================================================
// STEP 6: Create a matrix of (first YM, last YM)-specific fuel prices
   For all potential combinations of (first YM, last YM)
============================================================================= */
preserve 
	* Load data on monthly fuel prices (diesel, gasoline and kwh)
	import excel "${datain}fuel_prices_taxes.xlsx" ///
		, sheet("Sheet1") firstrow clear
	* Only keep the relevant years
	keep if year >= 2005 & year <= 2017	
	* Only keep the relevant variables
	keep year month diesel dtax_sum gasoline gtax_sum kwh kwh_tax
	rename dtax_sum diesel_tax
	rename gtax_sum gasoline_tax

	* Index variable for year-month-combinations
	sort year month
	gen yearmonth = _n
	
	* Initializing variables
	gen start = .
	gen end = .
	gen diesel_mean = .
	gen diesel_tax_mean = .
	gen gasoline_mean = .
	gen gasoline_tax_mean = .
	gen kwh_mean = .
	gen kwh_tax_mean = .
	
	* Number of unique YM-combinations
	qui sum yearmonth 
	local T = r(max)

	/* Number of unique (first YM, last YM)-combinations */
	local setobs = 1
	forvalues t = 1(1)`T' {
		forvalues s = `t'(1)`T'{
			local setobs = `setobs' + 1
		}
	}
	set obs `setobs'
	/* ======================================== */

	local n = 1 										// observation count: from 1 to `setobs'
	forvalues t = 1(1)`T' { 							// looping over all potential (first YM)
		forvalues s = `t'(1)`T'{ 						// looping over all potential (last YM)
			qui replace start = `t' in `n' 				// start YM
			qui replace end = `s' in `n' 				// end YM
			qui sum diesel if yearmonth >= `t' & yearmonth <= `s'
			qui replace diesel_mean = r(mean) in `n' 	// Avg diesel price
			qui sum gasoline if yearmonth >= `t' & yearmonth <= `s'
			qui replace gasoline_mean = r(mean) in `n' 	// Avg gasoline price
			qui sum kwh if yearmonth >= `t' & yearmonth <= `s'
			qui replace kwh_mean = r(mean) in `n' 		// Avg kwh price
			
			qui sum diesel_tax if yearmonth >= `t' & yearmonth <= `s'
			qui replace diesel_tax_mean = r(mean) in `n'   // avg diesel tax
			qui sum gasoline_tax if yearmonth >= `t' & yearmonth <= `s'
			qui replace gasoline_tax_mean = r(mean) in `n' // avg gasoline tax
			qui sum kwh_tax if yearmonth >= `t' & yearmonth <= `s'
			qui replace kwh_tax_mean = r(mean) in `n'	   // avg kwh tax
			local n = `n' + 1							// Go to next line
			if `t' == `s' display `t' " of " `T'
		}
	}
	tempfile fuel_prices
	save `fuel_prices', replace
	
	/* Adding (start YM, end YM) indices for merging */	
	keep yearmonth year month 
	rename yearmonth start
	rename year start_year
	rename month start_month
	tempfile start
	save `start'
	
	rename start end
	rename start_year end_year
	rename start_month end_month
	tempfile end
	save `end'
	
	use `fuel_prices', clear
	merge m:m start using `start', keep(match master) // adding start_year and start_month
	drop _merge

	merge m:m end using `end', keep(match master) // adding end_year and end_month
	drop _merge
	drop if start_year == .
	
	/* Dropping the variables from the original dataset, only keeping start-end combinations*/
	drop year month gasoline gasoline_tax diesel diesel_tax kwh kwh_tax yearmonth start end

	rename diesel_mean diesel_price
	rename gasoline_mean gasoline_price
	rename kwh_mean kwh_price 

	save `fuel_prices', replace
restore 

/* =============================================================================
// STEP 7: Merge fuel prices to driving data
============================================================================= */
/* Note - fuel prices are averages within the 
   EU control period, not the household ownership period */
   
gen start_year = year(eu_start) 	// used for merging
gen end_year = year(eu_end) 		// used for merging
gen start_month = month(eu_start) 	// used for merging
gen end_month = month(eu_end) 		// used for merging	

merge m:1 start_year start_month end_year end_month using `fuel_prices' ///
	, keep(match master)
* Dropping auxiliary variables
drop start_month start_year end_month end_year _merge
compress

/* =============================================================================
// STEP 8: Last data selection and variable creation before analysis
============================================================================= */

* === Dropping missing observations and unnaturally long commutes ==============
drop if PublicTransitTime_1 == .
drop if PublicTransitTime_2 == .
keep if dist1 < 150 | dist1 == .
keep if dist2 < 150 | dist2 == .
keep if time1<=120 | time1==.     
keep if time2<=120 | time2==.     

* === Generating "weight" variable =============================================

* Share of driving period the car is owned by the household 
* - We only keep observations where weight > 0.5 in the final regressions
* - This means we keep *at most* one observation per driving period

gen weight = (hh_end - hh_start + 1) / (eu_end - eu_start + 1)

* === Generating "year" variables ==============================================
* Definition: share of the year contained in the driving period
forvalues i = 2005(1)2017 {
	/* 1 if year is contained in the EU control period */
	gen year_`i' = (year(eu_start) <= `i' & year(eu_end) >= `i')
	/* If last year of EU control */
	replace year_`i' = (eu_end - mdy(1,1,`i') + 1) / (eu_end - eu_start + 1) ///
		if year_`i' == 1 & year(eu_end) == `i' & year(eu_start) < `i'
	/* If first year of EU control */	
	replace year_`i' = (mdy(12,31,`i') - eu_start + 1) / (eu_end - eu_start + 1) ///
		if year_`i' == 1 & year(eu_start) == `i' & year(eu_end) > `i'
	/* If year is in the middle of EU control */	
	replace year_`i' = (mdy(12,31,`i') - mdy(1,1,`i') + 1) / (eu_end - eu_start + 1) ///
		if year_`i' == 1 & year(eu_start) < `i' & year(eu_end) > `i'
}

* === Generating "month" variables =============================================
* Definition: Share of driving period corresponding to a certain month

* Get start and end MY of driving period
gen eu_start_MY = ym(year(eu_start),month(eu_start))
gen eu_end_MY = ym(year(eu_end),month(eu_end))
* Initialize "month" variables
forvalues m = 1(1)12 {
	gen month_`m' = 0
}
* For each year: for each month within that year: add number of days covered by driving period
forvalues i = 2005(1)2017 {
	forvalues m = 1(1)12 {
		if `m' < 12 {
			/* If the month is AFTER start month and BEFORE end month */
			replace month_`m' = month_`m' + mdy(`m'+1,1,`i') - mdy(`m',1,`i') ///
				if eu_start_MY < ym(`i',`m') & eu_end_MY > ym(`i',`m')
			/* If the month is EQUAL TO start month and BEFORE end month */	
			replace month_`m' = month_`m' + mdy(`m'+1,1,`i') - eu_start ///
				if eu_start_MY == ym(`i',`m') & eu_end_MY > ym(`i',`m')
		}	
		if `m' == 12 {
			/* If the month is AFTER start month and BEFORE end month */
			replace month_`m' = month_`m' + 31 ///
				if eu_start_MY < ym(`i',`m') & eu_end_MY > ym(`i',`m')
			/* If the month is EQUAL TO start month and BEFORE end month */	
			replace month_`m' = month_`m' + mdy(`m',31,`i')+ 1 - eu_start ///
				if eu_start_MY == ym(`i',`m') & eu_end_MY > ym(`i',`m')	
		}
		/* If the month is AFTER start month and EQUAL TO end month */
		replace month_`m' = month_`m' + eu_end - mdy(`m',1,`i') + 1 ///
			if eu_start_MY < ym(`i',`m') & eu_end_MY == ym(`i',`m') 
		/* If the month is EQUAL TO start month and EQUAL TO end month */	
		replace month_`m' = month_`m' + eu_end - eu_start + 1 ///
			if eu_start_MY == ym(`i',`m') & eu_end_MY == ym(`i',`m') 	
	}
}
* Divide by number of days covered by the driving period in total
* This makes "month_{m}" share variables
forvalues m = 1(1)12 {
	replace month_`m' = month_`m' / (eu_end - eu_start + 1)
}
drop eu_start_MY eu_end_MY // drop auxiliary variables

* === Generating LHS variable: log of daily driving ============================
gen lnkm = ln(km_dag)

* If car is not diesel or electric, it is gasoline 
gen gas = (1 - diesel - electric)
compress

/* =============================================================================
// STEP 9: Save data
============================================================================= */

save "${dataout}DrivingDataset.dta", replace
clear

/* Variables that will be used in subsequent analyses:

children antpers_i_regstat_famnr secondhome 
employed1 employed2 retired1 retired2 age1 age2
wies1_decile wies2_decile wealth1_decile wealth2_decile
grputd1 grputd2
dist1 dist2 time1 time2
grkrets_num grk_bed1_num grk_bed2_num 
PublicTransitTime_1 PublicTransitTime_2
VENTETID_1 BOARDINGS_1  TAKST_1  VENTETID_2 BOARDINGS_2 TAKST_2
days_eu days_hh weight
diesel gas electric
diesel_price gasoline_price kwh_price
lnkm toll_fam_mean ptl_fam_km_mean 
year_* month_* weight
km_dag cars ctrl_nr

- Other variables can be dropped from the data without problems
*/

* ==============================================================================
